from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, IntegerType, StringType

if __name__ == '__main__':
    # Build the SparkSession object.
    # FIX: the original code passed "local[*]" to appName(). "local[*]" is a
    # master URL (run locally using all cores), not an application name, so no
    # master was actually configured. master() and appName() are set separately.
    spark = SparkSession.builder. \
        master("local[*]"). \
        appName("text_datasource_demo"). \
        config("spark.sql.shuffle.partitions", "4"). \
        getOrCreate()
    # master   sets where the job runs (local mode with all available cores)
    # appName  sets the application name shown in the Spark UI
    # config   sets arbitrary Spark properties
    # getOrCreate() returns an existing SparkSession or builds a new one

    # Obtain the SparkContext from the SparkSession.
    sc = spark.sparkContext

    # Schema with a single string column named "data"; used below to rename
    # the default "value" column produced by the text data source.
    schema = StructType().add("data", StringType(), nullable=True)

    # Read the file with the text data source; each line becomes one row
    # in a single string column whose default name is "value".
    df = spark.read.format("text").load("../../data/sql/people.txt")
    df.printSchema()
    df.show(truncate=False)

    # Output (default column name is "value"):
    # +-----------+
    # |value      |
    # +-----------+
    # |Michael, 29|
    # |Andy, 30   |
    # |Justin, 19 |
    # +-----------+

    # Same read, but with the explicit schema the single column is named "data".
    df = spark.read.format("text").schema(schema).load("../../data/sql/people.txt")
    df.printSchema()
    df.show(truncate=False)
    # Output:
    # +-----------+
    # |data       |
    # +-----------+
    # |Michael, 29|
    # |Andy, 30   |
    # |Justin, 19 |
    # +-----------+